package com.a.b.c.d.e.webmagic.processors;

import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ObjectUtils;
import org.springframework.util.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * WebMagic {@link PageProcessor} for a news site.
 *
 * <p>For every downloaded page it queues the article links found on that page and, when the page
 * itself contains both a headline and an article body, records the page URL and the headline as
 * result fields. Example article page:
 * {@code http://zjj.liaocheng.gov.cn/xwzx_14153/sdyw/202111/t20211130_3786533.html}.
 *
 * <p>Created on 2021/9/15.
 *
 * @author yanshuchun
 */
@Slf4j
public class NewsProcessor implements PageProcessor {
    /** Regex applied to every link on a page to pick out article URLs. */
    private static final String ARTICLE_LINK_REGEX = ".*/\\d{6}/\\w\\d{8}/*\\w\\d{7}.html";

    /** Start of the pager script call whose arguments {@link #dealPage} parses. */
    private static final String PAGER_CALL_PREFIX = "createPageHTML(";

    /** End of the pager script call. */
    private static final String PAGER_CALL_SUFFIX = ");";

    /** Minimum number of arguments {@link #dealPage} needs: it reads indexes 0, 2 and 3. */
    private static final int PAGER_MIN_ARGS = 4;

    /** Crawl settings: 3 retries per request, 1000 ms sleep between requests. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Queues the article links found on {@code page} and, if the page has both a headline and a
     * body, stores its URL and headline under the keys {@code "url"} and {@code "title"}.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        Html allHtml = page.getHtml();
        String currentUrl = page.getUrl().get();

        // NOTE(review): pagination discovery is not wired in here; dealPage(...) is never called
        // from this method, so only links present on the current page are followed.
        page.addTargetRequests(allHtml.links().regex(ARTICLE_LINK_REGEX).all());

        String title = allHtml.css(".news-cont > h2:nth-child(1)", "text").get();
        String content = allHtml.css("#Zoom").get();

        // Build the data to store. The body is only used as a presence check to tell article
        // pages from list pages.
        // NOTE(review): the body (and the date in ".news-info > span:nth-child(2)") is not
        // stored - confirm whether they should be added as result fields.
        if (!ObjectUtils.isEmpty(title) && !ObjectUtils.isEmpty(content)) {
            page.putField("url", currentUrl);
            page.putField("title", title);
        }
    }

    /**
     * Builds the URLs of the follow-up list pages from a pager script snippet.
     *
     * <p>The snippet is expected to contain a call of the form
     * {@code createPageHTML(total, current, "name", "ext");}. Spaces and double quotes are
     * stripped first; then for every {@code i} in {@code [1, total)} the URL
     * {@code s + name + "_" + i + "." + ext} is produced. Index 0 is skipped - presumably because
     * the first page has no numeric suffix (TODO confirm against the site).
     *
     * <p>Any input that cannot be parsed (empty snippet, missing call, fewer than four arguments,
     * non-numeric total) yields an empty list rather than an exception.
     *
     * @param pager the raw script text containing the {@code createPageHTML(...)} call; may be
     *              {@code null} or empty
     * @param s     the prefix prepended to every generated URL (not validated here)
     * @return a new mutable list with the generated URLs; empty if nothing could be derived
     */
    public List<String> dealPage(String pager, String s) {
        List<String> helpUrls = new ArrayList<>();
        if (ObjectUtils.isEmpty(pager)) {
            return helpUrls;
        }

        String normalized = pager.replace(" ", "").replace("\"", "");

        // Anchor on the call itself so unrelated parentheses earlier in the script are ignored.
        int callStart = normalized.indexOf(PAGER_CALL_PREFIX);
        if (callStart < 0) {
            return helpUrls;
        }
        int argsStart = callStart + PAGER_CALL_PREFIX.length();
        int argsEnd = normalized.indexOf(PAGER_CALL_SUFFIX, argsStart);
        if (argsEnd < 0) {
            return helpUrls;
        }

        String[] args = normalized.substring(argsStart, argsEnd).split(",");
        if (args.length < PAGER_MIN_ARGS) {
            return helpUrls;
        }

        int total;
        try {
            total = Integer.parseInt(args[0]);
        } catch (NumberFormatException ignored) {
            // A non-numeric page count is treated like any other malformed pager: no help URLs.
            return helpUrls;
        }

        for (int i = 1; i < total; i++) {
            helpUrls.add(s + args[2] + "_" + i + "." + args[3]);
        }
        return helpUrls;
    }

    /**
     * Returns the crawl settings used for this processor and prints a marker line to standard
     * output on every call.
     *
     * @return the shared {@link Site} configuration
     */
    @Override
    public Site getSite() {
        System.out.println("获取访问站点++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
        return site;
    }
}
